'''
    Jane Austen Textual Analysis - Dataset Creation Script
    By: Tyler Duckworth
    
    This Python script generates the metadata and main dataset 
    for this project. The main dataset consists of four .tsv files
    that contain the tokenized text of four Jane Austen novels.
'''
# built-in packages
import re
import os
import sys
import argparse
from pathlib import Path

# third-party packages
import nltk
import nltk.corpus
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
import pandas as pd
from unicodedata import normalize
import requests

def tokenize_passage(passage, stemmer, stopwords):
    '''
    Given a POS-tagged passage of the book, apply various transformations to it

    Args:
    - passage - Iterable of (word, pos_tag) pairs
    - stemmer - Object exposing a stem(word) method
    - stopwords - Collection of lowercase words to discard

    Returns the number of tokens and a tokenized string (tokens joined
    by single spaces).

    Transformations:
    - Filter out proper nouns (tags NNP and NNPS)
    - Filter out single-character words and stopwords
    - Fix various issues when going from Unicode to ASCII
    - Lowercase and stem each word, then remove hyphens and apostrophes
    - Drop tokens that are empty once cleaned
    '''
    tokens = []
    for word, pos in passage:
        if pos in ('NNP', 'NNPS') or len(word) <= 1 or word.lower() in stopwords:
            continue
        # decompose accented characters, then drop whatever is still non-ASCII
        ascii_word = normalize('NFKD', word.strip()).encode('ascii', 'ignore').decode('ascii')
        token = stemmer.stem(ascii_word.lower()).replace('-', '').replace("'", '').strip()
        # a word made only of hyphens, apostrophes or non-ASCII characters
        # cleans down to '' - skip it so it is neither counted nor joined
        if token:
            tokens.append(token)
    return len(tokens), ' '.join(tokens)

def create_metadata(output_path):
    '''
    Builds the metadata table for this dataset and saves it as a TSV file.

    Each row describes one novel: a numeric id, its title, its year of
    publication and the URL the raw text is downloaded from. The rows are
    hardcoded to keep the overall flow simple; this could be improved if
    the dataset needs to be expanded.

    Args:
    - output_path - Directory in which jane_austen_metadata.tsv is written

    Returns the metadata as a pandas DataFrame.
    '''
    column_names = ['id', 'title', 'publish_year', 'url']
    novels = [
        (1, 'Persuasion', 1818, 'https://www.gutenberg.org/cache/epub/105/pg105.txt'),
        (2, 'Pride and Prejudice', 1813, 'https://www.gutenberg.org/cache/epub/42671/pg42671.txt'),
        (3, 'Emma', 1816, 'https://www.gutenberg.org/cache/epub/158/pg158.txt'),
        (4, 'Sense and Sensibility', 1811, 'https://www.gutenberg.org/cache/epub/21839/pg21839.txt'),
    ]
    metadata = pd.DataFrame(novels, columns=column_names)

    # build the destination once so the file and the log message always agree
    target = f"{output_path}/jane_austen_metadata.tsv"
    metadata.to_csv(target, sep='\t', index=False)
    print(f"Created metadata file at {target}")
    return metadata

def process_data_file(input_path, output_path):
    '''
    Main driver function to process a single, raw .txt file.

    The text between the first volume/chapter heading and the Project
    Gutenberg end marker is cleaned, split into volumes and chapters,
    tokenized chapter by chapter and written out as a TSV file with one
    row per chapter. Output columns: id, volume_no, chapter_no, length,
    tokens, rel_len, start, end, rel_start.

    Args:
    - input_path - File path to the raw text file
    - output_path - Path to save the TSV output dataset

    Raises:
    - ValueError - If the "CHAPTER I" heading or the Project Gutenberg
      end marker cannot be found in the text
    '''
    stopwords = set(nltk.corpus.stopwords.words('english'))
    # the context manager closes the handle even if read() raises
    with open(input_path, "r", encoding='utf-8') as src:
        content = src.read()

    # split into volumes (if any) then chapters
    # lines starting with "VOL." / "VOL" are rewritten to start with "VOLUME"
    content = re.sub(r'^VOL(\.?)(\s)', r'\n\nVOLUME\2', content, flags=re.MULTILINE)
    start_ind = content.index("\n\nCHAPTER I")
    if "\n\nVOLUME I" in content:
        start_ind = content.index("\n\nVOLUME I")
    end_ind = content.index("*** END OF THE PROJECT GUTENBERG EBOOK")

    # isolate main text
    data = content[start_ind:end_ind]

    # some of the novels have multiple volumes
    # remove everything from an "END OF VOLUME" line up to the next volume heading
    pattern = re.compile(r'(END OF VOLUME\s+[IVXLCDM]+.*?)\n.*?\n(VOLUME\s+[IVXLCDM]+)', re.DOTALL)
    data = re.sub(pattern, r'\n\n\2', data)
    # remove any misc data between the start of a volume and its first chapter
    pattern = re.compile(r'(VOLUME\s+[IVXLCDM]+.*?)\n.*?\n(CHAPTER\s+[IVXLCDM]+)', re.DOTALL)
    data = re.sub(pattern, r'\1\n\n\2', data)

    # replace volume/chapter headings with # and + for easy searching
    data = re.sub(r'^VOLUME.*$', '#', data, flags=re.MULTILINE)
    data = re.sub(r'^CHAPTER.*$', '+', data, flags=re.MULTILINE)

    # remove square bracket sections (usually descriptions of illustrations)
    # together with the rest of their line, and runs of capital letters and
    # whitespace that end in a period
    data = re.sub(r'\[.*?\].*|\b[A-Z][A-Z\s]*\.\n?', '', data, flags=re.MULTILINE)
    # blank out a run of 3-5 hyphens when the same run occurs again later on
    # the line, and blank out every sequence of digits
    data = re.sub(r'(--{2,4})(?=.*\1)|\d+', ' ', data, flags=re.MULTILINE)

    # remove punctuation
    # the typographic apostrophe is converted to ASCII first, so it is not
    # turned into a space by the punctuation pattern below
    data = data.replace(u'’', '\'')
    data = re.sub(r'[()\[\]_{},\‘’.;:!?—“”"&]', ' ', data, flags=re.MULTILINE)
    # replace 2+ \n with single \n
    data = re.sub(r'\n{2,}', '\n', data, flags=re.MULTILINE)
    data = data.strip()

    # appends custom stopwords that are noticeable in these works to reduce noise
    # must - 'mustn' and "mustn't" occur in the stopword collection but 'must' does not
    # NOTE(review): apostrophes are stripped from each chapter below before
    # tagging, so stopwords containing one (e.g. "can't") cannot match - confirm intent
    stopwords = stopwords | {'could', 'would', 'should', 'said', 'however', 'must', "can't"}
    stemmer = SnowballStemmer(language='english')

    # split corpus into array of volumes (if necessary) and array of chapters
    block_id = 1  # id of each block; named to avoid shadowing the builtin id()
    vol_no = 1
    res = []
    for vol in data.strip().split("#"):
        # text starting with a volume marker yields an empty first piece
        if not vol:
            continue
        ch_no = 1
        for ch in vol.strip().split("+"):
            if not ch:
                continue
            # flatten the chapter to one line, split hyphenated words and drop apostrophes
            ch = ch.strip().replace('\n', ' ').replace('--', ' ').replace('-', ' ').replace('\'', '')
            ch_w_pos = nltk.tag.pos_tag(ch.split())

            num_tokens, tokens = tokenize_passage(ch_w_pos, stemmer, stopwords)
            res.append([block_id, vol_no, ch_no, num_tokens, tokens])
            block_id += 1
            ch_no += 1
        vol_no += 1

    df = pd.DataFrame(res, columns=['id', 'volume_no', 'chapter_no', 'length', 'tokens'])

    # calculate absolute and relative position data using combined dataset
    total_length = df.loc[:,'length'].sum()
    df['rel_len'] = df.loc[:,'length'] / total_length
    # start is the token offset of the chapter's first token, end is one past its last
    df['start'] = df.loc[:,'length'].cumsum() -  df.loc[:,'length']
    df['end'] = df.loc[:,'length'].cumsum()
    df['rel_start'] = df['start'] / total_length

    df.to_csv(output_path, sep='\t', index=False)

    
def process_file(row, output_path):
    '''
    Retrieves the raw text from Project Gutenberg's website and 
    creates the dataset from it.

    The raw download is saved as <title>.txt in the current working
    directory; the processed dataset is written to output_path.

    Args:
    - row - Metadata record providing the 'title' and 'url' fields
    - output_path - Directory that receives the processed .tsv file

    Raises:
    - requests.HTTPError - If the server answers with an error status
    - requests.Timeout - If the server stops responding
    '''
    print(f"Processing {row['title']}:")
    # fail fast on a hung connection or an HTTP error page instead of saving
    # the error body and crashing later while parsing it
    r = requests.get(row['url'], allow_redirects=True, timeout=60)
    r.raise_for_status()
    name = row['title'].lower().replace(' ', '_')
    raw_path = f'{name}.txt'
    with open(raw_path, 'wb') as file:
        file.write(r.content)
    # report the location the file was actually written to
    print(f"\tDownloaded file to {raw_path}")
    process_data_file(raw_path, f'{output_path}/{name}.tsv')
    print(f"\tProcessed text and output to {output_path}/{name}.tsv")
    print("\tFinished processing.")
    
    
if __name__ == "__main__":
    # script entry point: parse the output directory, make sure the NLTK
    # resources and the directory exist, then build the metadata and datasets
    parser = argparse.ArgumentParser(description='Creates the metadata and dataset for four Jane Austen novels')
    parser.add_argument("output_path", help="Directory to store output")
    args = parser.parse_args()

    # download NLTK datasets if they don't already exist
    # nltk.data.find expects the category-qualified resource path; each
    # resource is checked on its own so only the missing one is fetched
    for resource, package in (
        ('corpora/stopwords', 'stopwords'),
        ('taggers/averaged_perceptron_tagger', 'averaged_perceptron_tagger'),
    ):
        try:
            nltk.data.find(resource)
        except LookupError:
            nltk.download(package)

    # create output directory (and any missing parents) if it doesn't exist already
    try:
        os.makedirs(args.output_path, exist_ok=True)
    except OSError as ex:
        print(f"Error while creating directory at {args.output_path}.\nFull error:")
        print(ex)
        # non-zero status so callers can tell the run failed
        sys.exit(1)

    metadata = create_metadata(args.output_path)
    # process_file is called once per metadata row for its side effects
    _ = metadata.apply(process_file, args=(args.output_path,), axis=1)